{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ec8d6189",
   "metadata": {},
   "source": [
    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/movie-dialogs/convert-to-instruction-format.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "493f2529",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "65a47f83",
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import numpy as np\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "IMDB = 7.0"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "480440f6",
   "metadata": {},
   "source": [
    "## Dialog templates\n",
    "Templates for converting dialogs to prompts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "fcfedd7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "DIALOG_TEMPLATES = {\n",
    "    ### template for 4+ line dialogs\n",
    "    \"four_more_lines\": [\n",
    "        \"\"\"\n",
    "Here's a {template} between {char1} and {char2} in a scene from a {genre} movie\n",
    "    {dialogue1}\n",
    "User : Can you continue the {template}\n",
    "Assistant : Sure, the next dialogue for this scene could be\n",
    "    {dialogue2}\n",
    " \"\"\",\n",
    "        \"\"\"\n",
    "    {dialogue1}\n",
    "User : Can you provide more dialog assuming {genre} movie\n",
    "    {dialogue2}\n",
    "\"\"\",\n",
    "        \"\"\"\n",
    "I'm trying to complete the dialog for my characters {char1} and {char2}. Here's the {template}, Please help me complete it\n",
    "    {dialogue1}\n",
    "Assistant : Sure\n",
    "    {dialogue2}\n",
    "\"\"\",\n",
    "        \"\"\"\n",
    "User : Assume {char1} and {char2} are characters from a {genre} movie, continue the conversation between them\n",
    "    {dialogue1}\n",
    "Assistant : Sure\n",
    "    {dialogue2}\n",
    "\"\"\",\n",
    "    ],\n",
    "    ## template for 4 line dialogs\n",
    "    \"four_lines\": [\n",
    "        \"\"\"\n",
    "    {dialogue1}\n",
    "User : provide a response assuming you're {char2}\n",
    "Assistant : Sure\n",
    "    {dialogue2}\n",
    "\"\"\",\n",
    "        \"\"\"\n",
    "    {dialogue1}\n",
    "User : respond as {char2} to complete the conversation\n",
    "Assistant : Sure\n",
    "    {dialogue2}\n",
    "\"\"\",\n",
    "    ],\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2047056e",
   "metadata": {},
   "source": [
    "- Download Cornell-movies dialog dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e413a053",
   "metadata": {},
   "outputs": [],
   "source": [
    "! wget wget https://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip\n",
    "! unzip movie-corpus.zip -d ./Data/"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e2aab0d",
   "metadata": {},
   "source": [
    "## Code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "25cae04e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_movie_dialogs():\n",
    "    with open(\"./Data/movie-corpus/utterances.jsonl\", \"r\") as json_file:\n",
    "        conversations = list(json_file)\n",
    "    speakers = json.load(open(\"./Data/movie-corpus/speakers.json\"))\n",
    "    movie_dialog_dict = {}\n",
    "    for dialog in tqdm(conversations):\n",
    "        dialog = eval(dialog.replace(\"null\", \"None\"))\n",
    "        movie_dialog_dict[dialog[\"id\"]] = {\n",
    "            \"characterName\": speakers[dialog[\"speaker\"]][\"meta\"][\"character_name\"],\n",
    "            \"text\": dialog[\"text\"],\n",
    "            \"characterID\": dialog[\"speaker\"],\n",
    "        }\n",
    "\n",
    "    return movie_dialog_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3b949bc7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dialogs(dialog_dict, start, end):\n",
    "    dialog_list = []\n",
    "    for idx in range(start, end + 1):\n",
    "        dialog_list.append(dialog_dict[f\"L{idx}\"][\"characterName\"] + \": \" + dialog_dict[f\"L{idx}\"][\"text\"])\n",
    "    num_lines = len(dialog_list)\n",
    "\n",
    "    assert num_lines >= 1, \"Number of lines should be greater than one\"\n",
    "\n",
    "    if num_lines < 6:\n",
    "        dialog1 = \"\\n    \".join(dialog_list[:-1])\n",
    "        dialog2 = dialog_list[-1]\n",
    "    else:\n",
    "        dialog_len = np.random.randint(3, (num_lines // 2) + 1)\n",
    "        dialog1 = \"\\n    \".join(dialog_list[:dialog_len])\n",
    "        dialog2 = \"\\n    \".join(dialog_list[dialog_len:])\n",
    "\n",
    "    return dialog1, dialog2\n",
    "\n",
    "\n",
    "def choose_prompt(num_lines):\n",
    "    assert num_lines >= 1, \"Number of lines should be greater than one\"\n",
    "\n",
    "    if num_lines < 6:\n",
    "        prompt = np.random.choice(DIALOG_TEMPLATES[\"four_lines\"])\n",
    "\n",
    "    else:\n",
    "        prompt = np.random.choice(DIALOG_TEMPLATES[\"four_more_lines\"])\n",
    "\n",
    "    return prompt\n",
    "\n",
    "\n",
    "def convert_to_prompts(dataset, movie_dialog_dict, output_dir=\".\", split=\"train\"):\n",
    "    with open(f\"{output_dir}/{split}.jsonl\", \"w\", encoding=\"utf8\") as output:\n",
    "        i = 0\n",
    "        while i < len(dataset[\"train\"]):\n",
    "            data = dataset[split][i]\n",
    "            if float(data[\"movieIMDBRating\"].strip()) >= IMDB:\n",
    "                max_lines = np.random.randint(7, 12)\n",
    "                lineids = [int(lineid[1:]) for lineid in data[\"utterance\"][\"LineID\"]]\n",
    "                num_lines = len(lineids)\n",
    "                char_ids = sorted([data[\"characterID1\"].strip(), data[\"characterID1\"].strip()])\n",
    "                while num_lines < max_lines:\n",
    "                    i += 1\n",
    "                    data = dataset[split][i]\n",
    "                    char_id_new = sorted([data[\"characterID1\"].strip(), data[\"characterID1\"].strip()])\n",
    "                    ## make sure that characters are the same\n",
    "                    if char_id_new == char_ids:\n",
    "                        lineids_new = [int(lineid[1:]) for lineid in data[\"utterance\"][\"LineID\"]]\n",
    "                        if lineids_new[0] == (lineids[-1] + 1):  ##ensure continuity\n",
    "                            lineids.extend(lineids_new)\n",
    "                        else:\n",
    "                            break\n",
    "                    else:\n",
    "                        break\n",
    "                    num_lines = len(lineids)\n",
    "\n",
    "                genre = \"-\".join(data[\"movieGenres\"][:2])\n",
    "                template = np.random.choice([\"dialog\", \"script\", \"play\"])\n",
    "                char1 = movie_dialog_dict[f\"L{lineids[0]}\"][\"characterName\"]\n",
    "\n",
    "                if num_lines < 6:\n",
    "                    if num_lines % 2 == 0:\n",
    "                        char2 = movie_dialog_dict[f\"L{lineids[1]}\"][\"characterName\"]\n",
    "                    else:\n",
    "                        char2 = char1\n",
    "                else:\n",
    "                    char2 = movie_dialog_dict[f\"L{lineids[1]}\"][\"characterName\"]\n",
    "\n",
    "                dialogue1, dialogue2 = get_dialogs(movie_dialog_dict, lineids[0], lineids[-1])\n",
    "                prompt = choose_prompt(num_lines)\n",
    "\n",
    "                prompt = prompt.format(\n",
    "                    char1=char1, char2=char2, dialogue1=dialogue1, dialogue2=dialogue2, genre=genre, template=template\n",
    "                )\n",
    "                output.write(f\"{json.dumps({'conversation': prompt})}\\n\")\n",
    "            i += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "3ff310fd",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|███████████████████████████████| 304713/304713 [00:54<00:00, 5628.12it/s]\n",
      "Found cached dataset cornell_movie_dialog (/home/shahul/.cache/huggingface/datasets/cornell_movie_dialog/default/0.1.0/b67b3433cf894b551cddcd82efdff0826f39b39a11d5c149e746a546a8dc85f3)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6fee977c69a3403ebe77c4669fcb25d7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "movie_dialog_dict = get_movie_dialogs()\n",
    "dataset = load_dataset(\"cornell_movie_dialog\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8567ca12",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "convert_to_prompts(dataset, movie_dialog_dict)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02315e91",
   "metadata": {},
   "source": [
    "## Upload as HF Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "dd4c05c1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default-315650f1b3e45d2e\n",
      "Found cached dataset json (/home/shahul/.cache/huggingface/datasets/json/default-315650f1b3e45d2e/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b79b4c273dc44735badf9bff51ade320",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset_ = load_dataset(\"json\", data_files={\"train\": \"./train.jsonl\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "d67fa1f9",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Pushing split train to the Hub.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7af96153f0cf45b488d14515a7529ae7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0baf18a08d0e48fa90484f4cd931baa2",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Creating parquet from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cadf382a65274fdfbaea1820c04b146e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b10d1b17f22c4da4b0395b57d0099cde",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset_.push_to_hub(\"shahules786/OA-cornell-movies-dialog\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b4dd2df",
   "metadata": {},
   "source": [
    "## Load Dataset from HF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "f64c9b2e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "49b4ce66a511425ba2886eeb73ba0664",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading readme:   0%|          | 0.00/1.54k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration shahules786--OA-cornell-movies-dialog-7b3f29da4e713888\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Downloading and preparing dataset None/None to /home/shahul/.cache/huggingface/datasets/shahules786___parquet/shahules786--OA-cornell-movies-dialog-7b3f29da4e713888/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bb19919febc74f1ab5ecfcfd54d9167b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "df11363157cf439e8e1215386d764a5e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Downloading data:   0%|          | 0.00/4.86M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "99abed3078764c11a08d734faf405b75",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fcda49e14a9e4e2385c7a696100fb7a0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split:   0%|          | 0/20959 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dataset parquet downloaded and prepared to /home/shahul/.cache/huggingface/datasets/shahules786___parquet/shahules786--OA-cornell-movies-dialog-7b3f29da4e713888/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9f4e3a03fed94c1b8978309ec1605bb8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "dataset_ = load_dataset(\"shahules786/OA-cornell-movies-dialog\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "1234f33f",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "##\n",
      "\n",
      "    BOWMAN: I didn't do that Frank. I took particular care not to freeze them.\n",
      "    POOLE: I guess you don't know your own strength, old boy.\n",
      "    BOWMAN: I guess not.\n",
      "    POOLE: I think I'll have to go out and burn them off.\n",
      "User : respond as BOWMAN to complete the conversation\n",
      "Assistant : Sure\n",
      "    BOWMAN: Roger.\n",
      "\n",
      "##\n",
      "\n",
      "    HAL: Sorry to interrupt the festivities, Dave, but I think we've got a problem.\n",
      "    BOWMAN: What is it, Hal?\n",
      "User : respond as HAL to complete the conversation\n",
      "Assistant : Sure\n",
      "    HAL: MY F.P.C. shows an impending failure of the antenna orientation unit.\n",
      "\n",
      "##\n",
      "\n",
      "I'm trying to complete the dialog for my characters BOWMAN and HAL. Here's the script, Please help me complete it\n",
      "    BOWMAN: Not now, Hal, I'd like to talk to you about something.\n",
      "    HAL: Sure, Dave, what's up?\n",
      "    BOWMAN: You know that we checked the two AO-units that you reported in imminent failure condition?\n",
      "Assistant : Sure\n",
      "    HAL: Yes, I know.\n",
      "    BOWMAN: You probably also know that we found them okay.\n",
      "    HAL: Yes, I know that. But I can assure you that they were about to fail.\n",
      "\n",
      "##\n",
      "\n",
      "Here's a play between HAL and BOWMAN in a scene from a adventure-mystery movie\n",
      "    HAL: Naturally, Dave, I'm not pleased that the AO-unit has failed, but I hope at least this has restored your confidence in my integrity and reliability. I certainly wouldn't want to be disconnected, even temporarily, as I have never been disconnected in my entire service history.\n",
      "    BOWMAN: I'm sorry about the misunderstanding, Hal.\n",
      "    HAL: Well, don't worry about it.\n",
      "User : Can you continue the play\n",
      "Assistant : Sure, the next dialogue for this scene could be\n",
      "    BOWMAN: And don't you worry about it.\n",
      "    HAL: Is your confidence in me fully restored?\n",
      "    BOWMAN: Yes, it is, Hal.\n",
      "    HAL: Well, that's a relief. You know I have the greatest enthusiasm possible for the mission.\n",
      " \n",
      "##\n",
      "\n",
      "    HAL: I suppose it's because you've been under a lot of stress, but have you forgotten that they're not supposed to be revived for another three months.\n",
      "    BOWMAN: The antenna has to be replaced.\n",
      "    HAL: Repairing the antenna is a pretty dangerous operation.\n",
      "    BOWMAN: It doesn't have to be, Hal. It's more dangerous to be out of touch with Earth. Let me have manual control, please.\n",
      "User : respond as HAL to complete the conversation\n",
      "Assistant : Sure\n",
      "    HAL: I don't really agree with you, Dave. My on-board memory store is more than capable of handling all the mission requirements.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "for i in range(10, 15):\n",
    "    print(\"##\")\n",
    "    print(dataset_[\"train\"][i][\"conversation\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "806f3ef2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['conversation'],\n",
       "    num_rows: 20959\n",
       "})"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_[\"train\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10506ff9",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "OpenAssistant",
   "language": "python",
   "name": "openassistant"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
